Introduction

The report describes a limma differential expression analysis of melanoma samples from the latest 24Q4 Depmap bulk RNAseq gene expression data, and creates lists of DE genes between the average gene expression in tumor samples, such as those derived from various organs colonized by melanoma metastases: (e.g. lung, liver, central nervous system, etc.).

This analysis of the MetMap data is motivated by the following premise:

What are the gene programs the drive melanoma metastases?

NOTE: 1) This report version uses “raw” gene counts and not TPM counts! 1) All DE comparisons in this report are made relative to primary tumor (i.e. “primary”).

Background

The main questions that we would like to answer with the Depmap RNAseq data are as follows:

  1. What is the difference between cell lines with “tropism” in liver versus lungs?
  2. Which are the top genes in each metastatic-colonized organ, based on the cell line status?
  3. Find genes unique in metastatic cell lines (for each metastatic organ) versus primary tumor cell lines?
  4. Can we identify gene regulatory networks specific for each organ and across organs? (to do later, in a following report, perhaps using SCENIC.)

For more about how the Depmap data was originally processed, please refer to these papers or web pages: 1) original Depmap paper 1) Depmap documentation page 1) Depmap 24Q4 release statment

Data Preparation

We have pre-downloaded and semi-processed the Depmap melanoma metadata and bulk count matrix, which we load here:

## latest count matrix
readr::read_csv("./data/OmicsDefaultModelProfiles.csv") %>%
  dplyr::rename_with(janitor::make_clean_names) %>%
  dplyr::rename(depmap_id = model_id) %>%
  dplyr::select(-profile_type) %>%
  as.data.frame() -> map_depmap_id_to_patient_id

## latest metadata, filter for melanoma datasets only
readr::read_csv("./data/Model.csv") %>%
  dplyr::rename(depmap_id = names(.)[1]) %>%
  dplyr::select(-c(PatientSubtypeFeatures, TissueOrigin:PublicComments,
                   PublicComments, HCMIID:COSMICID, contains("Source"))) %>%
  dplyr::select(depmap_id, CCLEName, everything()) %>%
  dplyr::rename_with(janitor::make_clean_names) %>%
  dplyr::filter(grepl("Melanoma", oncotree_primary_disease)) %>%
  as.data.frame() -> meta_data
# View(meta_data)

## latest count data
readr::read_csv("./data/OmicsExpressionTranscriptsExpectedCountProfile.csv") %>%
  dplyr::rename(profile_id = names(.)[1]) %>%
  dplyr::left_join(map_depmap_id_to_patient_id, by = "profile_id") %>%
  dplyr::filter(depmap_id %in% meta_data$depmap_id) %>%
  dplyr::select(-profile_id) %>%
  tidyr::pivot_longer(cols = -depmap_id,
                      names_to = "gene_transcript",
                      values_to = "count") %>%
  dplyr::mutate(gene_name = stringr::str_extract(gene_transcript, "^[^\\s]+")) %>%
  dplyr::group_by(depmap_id, gene_name) %>%
  dplyr::summarise(count = sum(as.numeric(count)), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = gene_name, values_from = count) %>%
  tibble::column_to_rownames("depmap_id") %>%
  t() %>% as.data.frame() -> cts

## make metadata consistent with count matrix
meta_data %>%
  dplyr::arrange(depmap_id) %>% 
  dplyr::filter(depmap_id %in% colnames(cts)) -> meta_data2

## save finished files
saveRDS(meta_data2, file = "./data/new_depmap_melanoma_metadata.rds")
saveRDS(cts, file = "./data/new_depmap_melanoma_counts.rds")

Below is a table displaying the metadata for all melanoma samples in Depmap:

# read_excel("./data/Supplementary Table 03 MetMap cell line annotation.xlsx") %>% 
#   dplyr::filter(cancer_type == "melanoma") %>%
#   dplyr::arrange(depmap_id) %>%
#   as.data.frame() -> MetMap_melanoma_metadata

readRDS(file = "./data/metmap_melanoma_merged_data.rds") %>%
  dplyr::filter(!duplicated(depmap_id)) %>%
  as.data.frame() -> metmap_merged

## latest metadata
readRDS(file = "./data/new_depmap_melanoma_metadata.rds") %>%
  # dplyr::filter(depmap_id %in% MetMap_melanoma_metadata$depmap_id) %>% 
  dplyr::mutate(across(primary_or_metastasis, ~ ifelse(is.na(.), "Unknown", .)),
                met_site_group = factor(
                  gsub(" ", "_",
                       ifelse(primary_or_metastasis == "Primary",
                              "primary", tolower(sample_collection_site))))) %>%
  dplyr::left_join(metmap_merged, by = "depmap_id") %>%
  dplyr::mutate(
    combined_class = as.factor(tolower(coalesce(tropic_class, primary_or_metastasis)))) %>%
  tibble::column_to_rownames(var = "depmap_id") %>%
  as.data.frame() -> meta_data
# dim(meta_data)

## latest counts
readRDS(file = "./data/new_depmap_melanoma_counts.rds") %>%
  # dplyr::select(meta_data$depmap_id) %>%
  as.data.frame() -> cts
# dim(cts)

## save the converted metadata table ( for Takis)
# meta_data %>%
#   tibble::rownames_to_column(var = "depmap_id") %>%
#   writexl::write_xlsx(path = "./data/depmap_melanoma_metadata_clean.xlsx")

meta_data %>%
  DT::datatable()

Table breaking down sample organ distribution:

table(meta_data$met_site_group)
#> 
#>                            ascites             central_nervous_system 
#>                                  2                                  3 
#> haematopoietic_and_lymphoid_tissue                          left_heel 
#>                                  1                                  1 
#>                              liver                               lung 
#>                                  2                                  3 
#>                         lymph_node                   pleural_effusion 
#>                                 35                                  2 
#>                            primary                          sinonasal 
#>                                 41                                  1 
#>                               skin                    small_intestine 
#>                                 32                                  2 
#>                               uvea 
#>                                  1

Table breaking down annotated metastases type distribution:

table(meta_data$combined_class)
#> 
#>       aggressive_metastatic          broadly_metastatic 
#>                          10                           6 
#> liver_and_kidney_metastatic   liver_and_lung_metastatic 
#>                           1                           5 
#>                liver_tropic    lung_and_bone_metastatic 
#>                           3                           1 
#>                 lung_tropic                  metastatic 
#>                           4                          56 
#>              non_metastatic                     primary 
#>                           3                          28 
#>                     unknown             weak_metastatic 
#>                           3                           6

Filter Raw Counts

## Step 1: filter out genes with all-zero counts
# cts_filtered <- cts[rowSums(cts) > 0, ]
# keep <- rowSums(cpm(cts) > 1) >= 5
min_samples <- ceiling(0.2 * nrow(meta_data))  # about 25 samples
keep <- rowSums(cpm(cts) > 1) >= min_samples
cts_filtered <- cts[keep, ]
dim(cts_filtered)
#> [1] 15744   126

Preliminary Visualizations

We display a few visualizations.

Donut Plot

Visualization of number of metastatic samples by organ sample site:

# Summarize counts
meta_data %>%
  # dplyr::filter(primary_or_metastasis == "Metastatic") %>%
  dplyr::count(sample_collection_site) %>%
  dplyr::arrange(desc(sample_collection_site)) %>%
  dplyr::mutate(prop = n / sum(n) * 100,
                ypos = cumsum(prop) - 0.5 * prop) %>%
  as.data.frame() -> df_counts

# Create donut plot
df_counts %>%
  ggplot(aes(x = 2, y = prop, fill = sample_collection_site)) +
    geom_bar(stat = "identity", width = 1, color = "black") +
    coord_polar(theta = "y") +
    xlim(0.5, 2.5) +
    theme_void() +
    geom_text_repel(aes(x = 2.5, y = ypos, label = paste0(
      sample_collection_site, "\n", round(prop, 1), "%")), color = "black", size = 4) +
    ggtitle("Distribution of Sample Collection Sites") +
    theme(plot.title = element_text(hjust = 0.5),
          legend.position = "none")

Bar Plot

Visualization of number of metastatic samples by organ sample site:

meta_data %>%
  # dplyr::filter(primary_or_metastasis == "Metastatic") %>%
  dplyr::count(sample_collection_site) %>%
  as.data.frame() -> metadata_summary

# Create bar plot
metadata_summary %>%
ggplot(aes(x = sample_collection_site, y = n, fill = sample_collection_site)) +
  geom_bar(stat = "identity") +
  labs(
    title = "Distribution of Sample Collection Sites (Metastatic Samples Only)",
    x = "Sample Type", y = "Count") +
  theme_classic() +
  theme(legend.position = "none") +
  theme(axis.text.x = element_text(angle = 45, hjust=1))

Raw Counts – Density Plot

## Plot raw count density (log scale to visualize better)
as.data.frame(cts) %>%
  tibble::rownames_to_column("gene") %>%
  tidyr::pivot_longer(-gene, names_to = "sample", values_to = "raw_count") %>%
  ggplot(aes(x = raw_count + 1, group = sample)) +
    geom_density(alpha = 0.3, color = "grey50") +
    scale_x_log10() +
    labs(title = "Density Plot of Raw Counts (Before Filtering)",
         x = "Raw Counts (log10 scale)", y = "Density") +
    theme_minimal()

logCPM – Before Filtering

## Calculate logCPM from raw counts before filtering
dge_all <- DGEList(counts = cts)
dge_all <- calcNormFactors(dge_all)
logCPM_all <- cpm(dge_all, log = TRUE)

## Convert to long format
logCPM_long_all <- as.data.frame(logCPM_all) %>%
  tibble::rownames_to_column("gene") %>%
  tidyr::pivot_longer(-gene, names_to = "sample", values_to = "logCPM")

## Density plot
ggplot(logCPM_long_all, aes(x = logCPM, group = sample)) +
  geom_density(alpha = 0.3, color = "grey40") +
  labs(title = "logCPM Density (Before Filtering)", x = "log2(CPM)", y = "Density") +
  theme_minimal()

## Optional: Boxplot
ggplot(logCPM_long_all, aes(x = sample, y = logCPM)) +
  geom_boxplot(outlier.size = 0.5) +
  labs(title = "logCPM Distribution (Before Filtering)",
       x = "Sample", y = "log2(CPM)") +
  theme_minimal() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

logCPM – After Filtering

## Use filtered counts (already done in your code)
dge_filt <- DGEList(counts = cts_filtered)
dge_filt <- calcNormFactors(dge_filt)
logCPM_filt <- cpm(dge_filt, log = TRUE)

## Convert to long format
logCPM_long_filt <- as.data.frame(logCPM_filt) %>%
  tibble::rownames_to_column("gene") %>%
  tidyr::pivot_longer(-gene, names_to = "sample", values_to = "logCPM")

## Density plot
ggplot(logCPM_long_filt, aes(x = logCPM, group = sample)) +
  geom_density(alpha = 0.3, color = "darkblue") +
  labs(title = "logCPM Density (After Filtering)", x = "log2(CPM)", y = "Density") +
  theme_minimal()

## Optional: Boxplot
ggplot(logCPM_long_filt, aes(x = sample, y = logCPM)) +
  geom_boxplot(outlier.size = 0.5, fill = "lightblue") +
  labs(title = "logCPM Distribution (After Filtering)",
       x = "Sample", y = "log2(CPM)") +
  theme_minimal() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

2D PCA

We do not observe good separation between class labels.

# Step 1: Filter out lowly expressed genes
# keep <- rowSums(cpm(cts) > 1) >= 5
# cts_filtered <- cts[keep, ]
# min_samples <- ceiling(0.2 * nrow(meta_data))  # about 25 samples
# keep <- rowSums(cpm(cts) > 1) >= min_samples
# cts_filtered <- cts[keep, ]

# Step 2: TMM normalization and log transformation
dge <- DGEList(counts = cts_filtered)
dge <- calcNormFactors(dge)
logCPM <- cpm(dge, log = TRUE)

# Step 3: PCA
pca <- prcomp(t(logCPM), scale. = TRUE)  # transpose to make samples rows

# Step 4: Build data frame for ggplot
pca_df <- as.data.frame(pca$x)  # principal components
pca_df$sample <- colnames(cts_filtered)

# Step 5: Plot the PCA
pca_df %>% 
  tibble::rownames_to_column(var = "depmap_id") %>%
  left_join(meta_data %>% tibble::rownames_to_column(var = "depmap_id"), by = "depmap_id") %>%
  dplyr::mutate(cell_line = gsub("_.*", "", ccle_name)) %>%
  ggplot(aes(x = PC1, y = PC2, label = cell_line, color = combined_class)) +
    geom_point(size = 2) +
    geom_text(size = 2, vjust = -1) +
    theme_minimal() +
    labs(title = "PCA of logCPM-normalized expression",
         x = paste0("PC1 (", round(summary(pca)$importance[2, 1] * 100, 1), "% variance)"),
         y = paste0("PC2 (", round(summary(pca)$importance[2, 2] * 100, 1), "% variance)")) +
    theme(legend.position = "right")

3D PCA

pca_df %>% 
  tibble::rownames_to_column(var = "depmap_id") %>%
  left_join(meta_data %>% tibble::rownames_to_column(var = "depmap_id"), by = "depmap_id") %>%
  dplyr::mutate(cell_line = gsub("_.*", "", ccle_name)) %>%
  plotly::plot_ly(
    x = ~PC1,
    y = ~PC2,
    z = ~PC3,
    color = ~combined_class, 
    type = "scatter3d",
    mode = "markers",
    marker = list(size = 4)) %>%
    layout(
      title = "3D PCA Plot",
      scene = list(
        xaxis = list(title = "PC1"),
        yaxis = list(title = "PC2"),
        zaxis = list(title = "PC3")))

Sample Correlation Heatmap

Computes Pearson correlations between samples.

cor_matrix <- cor(logCPM_filt, method = "pearson")

annotation_col <- meta_data %>%
  dplyr::select(met_site_group, combined_class) %>%
  as.data.frame()
rownames(annotation_col) <- rownames(meta_data)

# Basic correlation heatmap with annotation
pheatmap(cor_matrix,
         annotation_col = annotation_col,
         show_rownames = FALSE,
         show_colnames = FALSE,
         clustering_distance_rows = "correlation",
         clustering_distance_cols = "correlation",
         clustering_method = "complete",
         color = colorRampPalette(c("blue", "white", "red"))(100),
         main = "Sample-Sample Correlation Heatmap (logCPM)")

Limma Expression Analysis

NOTE: “primary tumor” samples are the “normative” condition meaning that all genes with positive LFC are highter in this condition and genes with negative LFC are higher in the experimental conditions (e.g. lung).

Create the design matrix

### Step 2: Create a DGEList object
dge <- DGEList(counts = cts_filtered)

# Optional: TMM normalization (especially if library sizes vary)
dge <- calcNormFactors(dge)

### Step 3: Create the design matrix
# design <- model.matrix(~ 0 + met_site_group, data = meta_data)
# colnames(design) <- levels(meta_data$met_site_group)
design <- model.matrix(~ 0 + combined_class, data = meta_data)
# colnames(design) <- levels(meta_data$combined_class)
# unique(meta_data$combined_class)

### Step 4: Create contrast matrix
# contrast.matrix <- makeContrasts(
#   ## primary as normative
#   primary_vs_liver                     = liver_tropic - primary,
#   primary_vs_lung_tropic               = lung_tropic - primary,
#   primary_vs_aggressive_metastatic     = aggressive_metastatic - primary,
#   primary_vs_broadly_metastatic        = broadly_metastatic - primary,
#   primary_vs_weak_metastatic           = weak_metastatic - primary,
#   primary_vs_non_metastatic            = non_metastatic - primary,
#   primary_vs_liver_and_lung_metastatic = liver_and_lung_metastatic - primary,
#   ## skin as normative
#   # skin_vs_lung                         = lung - skin,
#   # skin_vs_liver                        = liver - skin,
#   # skin_vs_lymph_node                   = lymph_node - skin,
#   # skin_vs_pleural_effusion             = pleural_effusion - skin,
#   # skin_vs_central_nervous_system       = central_nervous_system - skin,
#   # skin_vs_ascites                      = ascites - skin,
#   # skin_vs_small_intestine              = small_intestine - skin,
#   # ## lymph_node as normative
#   # lymph_node_vs_lung                   = lung - lymph_node,
#   # lymph_node_vs_liver                  = liver - lymph_node,
#   # lymph_node_vs_lymph_node             = lymph_node - lymph_node,
#   # lymph_node_vs_pleural_effusion       = pleural_effusion - lymph_node,
#   # lymph_node_vs_central_nervous_system = central_nervous_system - lymph_node,
#   # lymph_node_vs_ascites                = ascites - lymph_node,
#   # lymph_node_vs_small_intestine        = small_intestine - lymph_node,
#   levels = design)

contrast.matrix <- makeContrasts(
  primary_vs_liver_tropic             = combined_classliver_tropic - combined_classprimary,
  primary_vs_lung_tropic              = combined_classlung_tropic - combined_classprimary,
  primary_vs_aggressive_metastatic    = combined_classaggressive_metastatic - combined_classprimary,
  primary_vs_broadly_metastatic       = combined_classbroadly_metastatic - combined_classprimary,
  primary_vs_weak_metastatic          = combined_classweak_metastatic - combined_classprimary,
  primary_vs_non_metastatic           = combined_classnon_metastatic - combined_classprimary,
  primary_vs_liver_and_lung_metastatic = combined_classliver_and_lung_metastatic - combined_classprimary,
  levels = design
)

Fit Model and Perform DEA

### Step 5: Apply voom transformation (with mean-variance trend plot)
v <- voom(dge, design, plot = TRUE)


### Step 6: Fit the model and apply contrasts
fit <- lmFit(v, design)
fit2 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit2)

Annotate DE Results

We annotate the DE genes with Ensembl and Entrez IDs from Biomart.

getBM(attributes = c("ensembl_gene_id", "entrezgene_id", "external_gene_name"),
      mart = useDataset("hsapiens_gene_ensembl", useMart("ensembl"))) %>%
  dplyr::rename(ensembl_id = ensembl_gene_id,
                entrez_id = entrezgene_id,
                gene_name = external_gene_name) %>%
  dplyr::filter(stringr::str_length(gene_name) > 1,
                !duplicated(ensembl_id),
                !duplicated(gene_name)) %>%
  saveRDS(file = "./data/human_biomart.rds")
## read the table that maps gene symbols to entrex and ensembl ids
human_biomart <- readRDS(file = "./data/human_biomart.rds")

# Extract top differentially expressed genes
topTable(fit2, coef = "primary_vs_liver_tropic", adjust.method = "BH",
         number = nrow(cts_filtered)) %>%
  tibble::rownames_to_column(var = "gene_name") %>% 
  dplyr::left_join(human_biomart, by = "gene_name") %>%
  # dplyr::left_join(
  #   cts_filtered %>%
  #     tibble::rownames_to_column(var = "gene_name") %>%
  #     dplyr::rowwise() %>%
  #     dplyr::mutate(
  #       skin_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "liver") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id))),
  #       primary_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "primary") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id)))) %>%
  #     dplyr::ungroup() %>%
  #     dplyr::select(gene_name, contains("exp")),
  #   by = "gene_name") %>%
  dplyr::select(gene_name, contains("id"), everything()) -> liver_de

topTable(fit2, coef = "primary_vs_lung_tropic", adjust.method = "BH",
         number = nrow(cts_filtered)) %>%
  tibble::rownames_to_column(var = "gene_name") %>% 
  dplyr::left_join(human_biomart, by = "gene_name") %>%
  # dplyr::left_join(
  #   cts_filtered %>% 
  #     tibble::rownames_to_column(var = "gene_name") %>%
  #     dplyr::rowwise() %>%
  #     dplyr::mutate(
  #       skin_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "lung") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id))),
  #       primary_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "primary") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id)))) %>%
  #     dplyr::ungroup() %>%
  #     dplyr::select(gene_name, contains("exp")),
  #   by = "gene_name") %>%
  dplyr::select(gene_name, contains("id"), everything()) -> lung_de

topTable(fit2, coef = "primary_vs_aggressive_metastatic", adjust.method = "BH",
         number = nrow(cts_filtered)) %>%
  tibble::rownames_to_column(var = "gene_name") %>% 
  dplyr::left_join(human_biomart, by = "gene_name") %>%
  # dplyr::left_join(
  #   cts_filtered %>% 
  #     tibble::rownames_to_column(var = "gene_name") %>%
  #     dplyr::rowwise() %>%
  #     dplyr::mutate(
  #       skin_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "aggressive_metastatic") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id))),
  #       primary_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "primary") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id)))) %>%
  #     dplyr::ungroup() %>%
  #     dplyr::select(gene_name, contains("exp")),
  #   by = "gene_name") %>%
  dplyr::select(gene_name, contains("id"), everything()) -> aggressive_metastatic_de

topTable(fit2, coef = "primary_vs_broadly_metastatic", adjust.method = "BH",
         number = nrow(cts_filtered)) %>%
  tibble::rownames_to_column(var = "gene_name") %>% 
  dplyr::left_join(human_biomart, by = "gene_name") %>%
  # dplyr::left_join(
  #   cts_filtered %>% 
  #     tibble::rownames_to_column(var = "gene_name") %>%
  #     dplyr::rowwise() %>%
  #     dplyr::mutate(
  #       skin_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "broadly_metastatic") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id))),
  #       primary_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "primary") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id)))) %>%
  #     dplyr::ungroup() %>%
  #     dplyr::select(gene_name, contains("exp")),
  #   by = "gene_name") %>%
  dplyr::select(gene_name, contains("id"), everything()) -> broadly_metastatic_de

topTable(fit2, coef = "primary_vs_weak_metastatic", adjust.method = "BH",
         number = nrow(cts_filtered)) %>%
  tibble::rownames_to_column(var = "gene_name") %>% 
  dplyr::left_join(human_biomart, by = "gene_name") %>% 
  # dplyr::left_join(
  #   cts_filtered %>% 
  #     tibble::rownames_to_column(var = "gene_name") %>%
  #     dplyr::rowwise() %>%
  #     dplyr::mutate(
  #       skin_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "weak_metastatic") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id))),
  #       primary_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "primary") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id)))) %>%
  #     dplyr::ungroup() %>%
  #     dplyr::select(gene_name, contains("exp")),
  #   by = "gene_name") %>%
  dplyr::select(gene_name, contains("id"), everything()) -> weak_metastatic_de

topTable(fit2, coef = "primary_vs_non_metastatic", adjust.method = "BH",
         number = nrow(cts_filtered)) %>%
  tibble::rownames_to_column(var = "gene_name") %>% 
  dplyr::left_join(human_biomart, by = "gene_name") %>%
  # dplyr::left_join(
  #   cts_filtered %>% 
  #     tibble::rownames_to_column(var = "gene_name") %>%
  #     dplyr::rowwise() %>%
  #     dplyr::mutate(
  #       skin_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "non_metastatic") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id))),
  #       primary_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "primary") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id)))) %>%
  #     dplyr::ungroup() %>%
  #     dplyr::select(gene_name, contains("exp")),
  #   by = "gene_name") %>%
  dplyr::select(gene_name, contains("id"), everything()) -> non_metastatic_de

topTable(fit2, coef = "primary_vs_liver_and_lung_metastatic", adjust.method = "BH",
         number = nrow(cts_filtered)) %>%
  tibble::rownames_to_column(var = "gene_name") %>%
  dplyr::left_join(human_biomart, by = "gene_name") %>%
  # dplyr::left_join(
  #   cts_filtered %>% 
  #     tibble::rownames_to_column(var = "gene_name") %>%
  #     dplyr::rowwise() %>%
  #     dplyr::mutate(
  #       skin_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "liver_and_lung_metastatic") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id))),
  #       primary_mean_exp = mean(c_across(
  #         meta_data %>%
  #           dplyr::filter(combined_class == "primary") %>%
  #           tibble::rownames_to_column(var = "depmap_id") %>%
  #           dplyr::pull(depmap_id)))) %>%
  #     dplyr::ungroup() %>%
  #     dplyr::select(gene_name, contains("exp")),
  #   by = "gene_name") %>%
  dplyr::select(gene_name, contains("id"), everything()) -> liver_and_lung_metastatic_de

# topTable(fit2, coef = "primary_vs_small_intestine", adjust.method = "BH",
#          number = nrow(cts_filtered)) %>%
#   tibble::rownames_to_column(var = "gene_name") %>% 
#   dplyr::left_join(human_biomart, by = "gene_name") %>%
#   dplyr::left_join(
#     cts_filtered %>% 
#       tibble::rownames_to_column(var = "gene_name") %>%
#       dplyr::rowwise() %>%
#       dplyr::mutate(
#         skin_mean_exp = mean(c_across(
#           meta_data %>%
#             dplyr::filter(combined_class == "small_intestine") %>%
#             tibble::rownames_to_column(var = "depmap_id") %>%
#             dplyr::pull(depmap_id))),
#         primary_mean_exp = mean(c_across(
#           meta_data %>%
#             dplyr::filter(combined_class == "primary") %>%
#             tibble::rownames_to_column(var = "depmap_id") %>%
#             dplyr::pull(depmap_id)))) %>%
#       dplyr::ungroup() %>%
#       dplyr::select(gene_name, contains("exp")),
#     by = "gene_name") %>%
#   dplyr::select(gene_name, contains("id"), everything()) -> si_de

Results

A Volcano plot (log2 fold change vs -log10 p-value) helps identify genes that display large magnitude changes and high significance.

Primary Tumor vs MetMap Liver Metastases

limma::plotMA(fit2, coef = 1)

liver_de %>%
  dplyr::filter(!is.na(P.Value)) %>%
  dplyr::mutate(
    negLogP = -log10(P.Value),
    sig = ifelse(adj.P.Val < 0.05 & abs(logFC) > 1,
                 "Significant", "Not Significant")) -> volcano_data
volcano_data %>%
  ggplot(aes(x = logFC, y = negLogP, color = sig)) +
  geom_point(size = 0.75, alpha = 0.75) +
  geom_text_repel(data = subset(volcano_data, sig == "Significant"),
                  aes(label = gene_name), size = 3, #max.overlaps = Inf,
                  box.padding = 0.3, point.padding = 0.3, segment.color = "grey50") +
  scale_color_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  geom_hline(yintercept = -log10(0.05), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = c(-1, 1), color = "blue", linetype = "dashed") +
  theme_bw() +
  ggtitle(paste0("Volcano Plot ", gsub("_", " ", names(as.data.frame(contrast.matrix))[1])))

Primary Tumor vs MetMap Lung Metastases

limma::plotMA(fit2, coef = 2)

lung_de %>%
  dplyr::filter(!is.na(P.Value)) %>%
  dplyr::mutate(
    negLogP = -log10(P.Value),
    sig = ifelse(adj.P.Val < 0.05 & abs(logFC) > 1,
                 "Significant", "Not Significant")) -> volcano_data
volcano_data %>%
  ggplot(aes(x = logFC, y = negLogP, color = sig)) +
  geom_point(size = 0.75, alpha = 0.75) +
  geom_text_repel(data = subset(volcano_data, sig == "Significant"),
                  aes(label = gene_name), size = 3, # max.overlaps = Inf,
                  box.padding = 0.3, point.padding = 0.3, segment.color = "grey50") +
  scale_color_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  geom_hline(yintercept = -log10(0.05), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = c(-1, 1), color = "blue", linetype = "dashed") +
  theme_bw() +
  ggtitle(paste0("Volcano Plot ", gsub("_", " ", names(as.data.frame(contrast.matrix))[2])))

Primary Tumor vs MetMap Aggressive Metastases

limma::plotMA(fit2, coef = 3)

aggressive_metastatic_de %>%
  dplyr::filter(!is.na(P.Value)) %>%
  dplyr::mutate(
    negLogP = -log10(P.Value),
    sig = ifelse(adj.P.Val < 0.05 & abs(logFC) > 1,
                 "Significant", "Not Significant")) -> volcano_data
volcano_data %>%
  ggplot(aes(x = logFC, y = negLogP, color = sig)) +
  geom_point(size = 0.75, alpha = 0.75) +
  geom_text_repel(data = subset(volcano_data, sig == "Significant"),
                  aes(label = gene_name), size = 3, #max.overlaps = Inf,
                  box.padding = 0.3, point.padding = 0.3, segment.color = "grey50") +
  scale_color_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  geom_hline(yintercept = -log10(0.05), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = c(-1, 1), color = "blue", linetype = "dashed") +
  theme_bw() +
  ggtitle(paste0("Volcano Plot ", gsub("_", " ", names(as.data.frame(contrast.matrix))[3])))

Primary Tumor vs MetMap Broad Metastases

limma::plotMA(fit2, coef = 4)

broadly_metastatic_de %>%
  dplyr::filter(!is.na(P.Value)) %>%
  dplyr::mutate(
    negLogP = -log10(P.Value),
    sig = ifelse(adj.P.Val < 0.05 & abs(logFC) > 1,
                 "Significant", "Not Significant")) -> volcano_data
volcano_data %>%
  ggplot(aes(x = logFC, y = negLogP, color = sig)) +
  geom_point(size = 0.75, alpha = 0.75) +
  geom_text_repel(data = subset(volcano_data, sig == "Significant"),
                  aes(label = gene_name), size = 3, #max.overlaps = Inf,
                  box.padding = 0.3, point.padding = 0.3, segment.color = "grey50") +
  scale_color_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  geom_hline(yintercept = -log10(0.05), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = c(-1, 1), color = "blue", linetype = "dashed") +
  theme_bw() +
  ggtitle(paste0("Volcano Plot ", gsub("_", " ", names(as.data.frame(contrast.matrix))[4])))

Primary Tumor vs MetMap Weak Metastases

limma::plotMA(fit2, coef = 5)

weak_metastatic_de %>%
  dplyr::filter(!is.na(P.Value)) %>%
  dplyr::mutate(
    negLogP = -log10(P.Value),
    sig = ifelse(adj.P.Val < 0.05 & abs(logFC) > 1,
                 "Significant", "Not Significant")) -> volcano_data
volcano_data %>%
  ggplot(aes(x = logFC, y = negLogP, color = sig)) +
  geom_point(size = 0.75, alpha = 0.75) +
  geom_text_repel(data = subset(volcano_data, sig == "Significant"),
                  aes(label = gene_name), size = 3, #max.overlaps = Inf,
                  box.padding = 0.3, point.padding = 0.3, segment.color = "grey50") +
  scale_color_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  geom_hline(yintercept = -log10(0.05), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = c(-1, 1), color = "blue", linetype = "dashed") +
  theme_bw() +
  ggtitle(paste0("Volcano Plot ", gsub("_", " ", names(as.data.frame(contrast.matrix))[5])))

Primary Tumor vs MetMap Non-Metastases

limma::plotMA(fit2, coef = 6)

non_metastatic_de %>%
  dplyr::filter(!is.na(P.Value)) %>%
  dplyr::mutate(
    negLogP = -log10(P.Value),
    sig = ifelse(adj.P.Val < 0.05 & abs(logFC) > 1,
                 "Significant", "Not Significant")) -> volcano_data
volcano_data %>%
  ggplot(aes(x = logFC, y = negLogP, color = sig)) +
  geom_point(size = 0.75, alpha = 0.75) +
  geom_text_repel(data = subset(volcano_data, sig == "Significant"),
                  aes(label = gene_name), size = 3, #max.overlaps = Inf,
                  box.padding = 0.3, point.padding = 0.3, segment.color = "grey50") +
  scale_color_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  geom_hline(yintercept = -log10(0.05), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = c(-1, 1), color = "blue", linetype = "dashed") +
  theme_bw() +
  ggtitle(paste0("Volcano Plot ", gsub("_", " ", names(as.data.frame(contrast.matrix))[6])))

Primary Tumor vs MetMap Liver and Lung Metastases

limma::plotMA(fit2, coef = 7)

liver_and_lung_metastatic_de %>%
  dplyr::filter(!is.na(P.Value)) %>%
  dplyr::mutate(
    negLogP = -log10(P.Value),
    sig = ifelse(adj.P.Val < 0.05 & abs(logFC) > 1,
                 "Significant", "Not Significant")) -> volcano_data
volcano_data %>%
  ggplot(aes(x = logFC, y = negLogP, color = sig)) +
  geom_point(size = 0.75, alpha = 0.75) +
  geom_text_repel(data = subset(volcano_data, sig == "Significant"),
                  aes(label = gene_name), size = 3, #max.overlaps = Inf,
                  box.padding = 0.3, point.padding = 0.3, segment.color = "grey50") +
  scale_color_manual(values = c("Significant" = "red", "Not Significant" = "black")) +
  geom_hline(yintercept = -log10(0.05), color = "blue", linetype = "dashed") +
  geom_vline(xintercept = c(-1, 1), color = "blue", linetype = "dashed") +
  theme_bw() +
  ggtitle(paste0("Volcano Plot ", gsub("_", " ", names(as.data.frame(contrast.matrix))[7])))

Save Results

We save the results as an RDS file and as Excel spreadsheet with a different DE gene list on each Excel sheet, with each sheet named according to that DE comparison.

## make list
de_list <- list(liver_de, lung_de, aggressive_metastatic_de, broadly_metastatic_de,
                weak_metastatic_de, non_metastatic_de, liver_and_lung_metastatic_de)

## name list elements
names(de_list) <- c(
  "primary_vs_liver_tropic",
  "primary_vs_lung_tropic",
  "primary_vs_aggressive_met",
  "primary_vs_broadly_mets",
  "primary_vs_weak_met",
  "primary_vs_non_met",
  "primary_vs_liver_and_lung_met")

## write Excel
write_xlsx(de_list, path = paste0("./data/", project_name, "_", Sys.Date(), ".xlsx"))

## write RDS
saveRDS(de_list, file = paste0("./data/", project_name, "_", Sys.Date(), ".rds"))

Session Info

sessionInfo()
#> R version 4.5.0 (2025-04-11)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Europe/Brussels
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] grid      stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] plotly_4.10.4      pheatmap_1.0.13    ggvenn_0.1.10      DT_0.33           
#>  [5] biomaRt_2.65.0     edgeR_4.7.2        limma_3.65.1       writexl_1.5.4     
#>  [9] readxl_1.4.5       readr_2.1.5        ggrepel_0.9.6.9999 ggplot2_3.5.2     
#> [13] tibble_3.3.0       tidyr_1.3.1        dplyr_1.1.4       
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.2.1     viridisLite_0.4.2    farver_2.1.2        
#>  [4] blob_1.2.4           filelock_1.0.3       Biostrings_2.77.1   
#>  [7] fastmap_1.2.0        lazyeval_0.2.2       BiocFileCache_2.99.5
#> [10] digest_0.6.37        lifecycle_1.0.4      statmod_1.5.0       
#> [13] KEGGREST_1.49.0      RSQLite_2.4.1        magrittr_2.0.3      
#> [16] compiler_4.5.0       rlang_1.1.6          sass_0.4.10         
#> [19] progress_1.2.3       tools_4.5.0          yaml_2.3.10         
#> [22] data.table_1.17.4    knitr_1.50           labeling_0.4.3      
#> [25] prettyunits_1.2.0    htmlwidgets_1.6.4    bit_4.6.0           
#> [28] curl_6.3.0           xml2_1.3.8           RColorBrewer_1.1-3  
#> [31] withr_3.0.2          purrr_1.0.4          BiocGenerics_0.55.0 
#> [34] stats4_4.5.0         scales_1.4.0         dichromat_2.0-0.1   
#> [37] cli_3.6.5            rmarkdown_2.29       crayon_1.5.3        
#> [40] generics_0.1.4       rstudioapi_0.17.1    httr_1.4.7          
#> [43] tzdb_0.5.0           DBI_1.2.3            cachem_1.1.0        
#> [46] stringr_1.5.1        AnnotationDbi_1.71.0 cellranger_1.1.0    
#> [49] XVector_0.49.0       vctrs_0.6.5          jsonlite_2.0.0      
#> [52] IRanges_2.43.0       hms_1.1.3            S4Vectors_0.47.0    
#> [55] bit64_4.6.0-1        crosstalk_1.2.1      locfit_1.5-9.12     
#> [58] jquerylib_0.1.4      glue_1.8.0           stringi_1.8.7       
#> [61] gtable_0.3.6         GenomeInfoDb_1.45.4  UCSC.utils_1.5.0    
#> [64] pillar_1.10.2        rappdirs_0.3.3       htmltools_0.5.8.1   
#> [67] R6_2.6.1             dbplyr_2.5.0         httr2_1.1.2         
#> [70] evaluate_1.0.3       lattice_0.22-7       Biobase_2.69.0      
#> [73] png_0.1-8            memoise_2.0.1        bslib_0.9.0         
#> [76] Rcpp_1.0.14          xfun_0.52            pkgconfig_2.0.3